import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from umap.umap_ import UMAP
import plotly.express as px
# PCA
#from sklearn.decomposition import PCA
# Library for t-SNE projections
#from sklearn.manifold import TSNE
2022-11-17 11:56:32.243397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Load metadata
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])
#meta
# Load gene expression matrix
df = pd.read_csv('lb_plasma_matrix.csv')
tdf = df.T
tdf.columns = tdf.iloc[0]
tdf = tdf[1:]
#tdf
# Merge metadata and gene expression matrix
main_df = pd.concat([meta, tdf], axis=1, join="inner")
# NaN columns are introduced from merge, so they are dropped
main_df = main_df.iloc[:,:2108]
# Drop the control and housekeeping miRNA genes
main_df = main_df.drop(['CTRL_ANT1','CTRL_ANT2','CTRL_ANT3','CTRL_ANT4','CTRL_ANT5','CTRL_miR_POS','HK_ACTB',
'HK_B2M','HK_GAPDH','HK_PPIA','HK_RNU47','HK_RNU75','HK_RNY3','HK_RPL19','HK_RPL27',
'HK_RPS12','HK_RPS20','HK_SNORA66','HK_YWHAZ'], axis = 1)
# Add the 'SDG_ID' column back
main_df = main_df.reset_index()
main_df.rename(columns = {'index':'SDG_ID'}, inplace = True)
#main_df
lgg_relapse_df = main_df[main_df['Short_Histology'] == 'LGG']
lgg_relapse_df
| SDG_ID | Specimen_Type | Diagnosis | Short_Histology | Tumor_Subtype | Relapse | Survival_Status | let-7a-2-3p | let-7a-3p | let-7a-5p | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15635-37 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | Yes | Alive | 178.0 | 69.0 | 7346.0 | ... | 23.0 | 44.0 | 31.0 | 19.0 | 21.0 | 125.0 | 38.0 | 891.0 | 75.0 | 290.0 |
| 3 | 15635-46 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | No | Alive | 480.0 | 43.0 | 8435.0 | ... | 11.0 | 99.0 | 172.0 | 67.0 | 35.0 | 62.0 | 118.0 | 1972.0 | 143.0 | 596.0 |
| 5 | 15635-60 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | No | Alive | 740.0 | 73.0 | 1372.0 | ... | 53.0 | 98.0 | 127.0 | 28.0 | 104.0 | 20.0 | 123.0 | 1020.0 | 174.0 | 370.0 |
| 6 | 15635-68 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | No | Alive | 481.0 | 79.0 | 18243.0 | ... | 61.0 | 190.0 | 162.0 | 98.0 | 75.0 | 256.0 | 67.0 | 1842.0 | 168.0 | 1263.0 |
| 7 | 15635-80 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | No | Alive | 3358.0 | 89.0 | 3032.0 | ... | 68.0 | 82.0 | 142.0 | 65.0 | 55.0 | 263.0 | 37.0 | 377.0 | 217.0 | 960.0 |
| 9 | 15635-90 | Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | Yes | Alive | 305.0 | 81.0 | 5942.0 | ... | 40.0 | 84.0 | 56.0 | 81.0 | 73.0 | 168.0 | 46.0 | 3492.0 | 155.0 | 282.0 |
| 10 | 15635-100 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | Yes | Alive | 353.0 | 26.0 | 26638.0 | ... | 0.0 | 57.0 | 62.0 | 24.0 | 22.0 | 120.0 | 72.0 | 3586.0 | 73.0 | 919.0 |
| 11 | 15635-101 | CSF, Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | No | Alive | 241.0 | 70.0 | 5121.0 | ... | 39.0 | 62.0 | 92.0 | 39.0 | 27.0 | 126.0 | 33.0 | 1508.0 | 64.0 | 254.0 |
| 12 | 15635-127 | Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic astrocytoma | Yes | Alive | 201.0 | 63.0 | 9160.0 | ... | 7.0 | 28.0 | 50.0 | 38.0 | 10.0 | 113.0 | 27.0 | 865.0 | 44.0 | 335.0 |
| 13 | 15635-134 | Plasma | Low-grade glioma/astrocytoma (WHO grade I/II) | LGG | pilocytic/pilomyxoid astrocytoma | Yes | Alive | 345.0 | 93.0 | 19679.0 | ... | 60.0 | 147.0 | 104.0 | 40.0 | 31.0 | 180.0 | 75.0 | 2621.0 | 154.0 | 1349.0 |
10 rows × 2090 columns
# Store Sample IDs as a list
lgg_relapse_sample_ids = lgg_relapse_df.index.to_list()
# Create ML-ready dataframe
relapse_df = lgg_relapse_df.drop(['SDG_ID', 'Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status'],axis=1)
#relapse_df
lgg_relapse_id = lgg_relapse_df['SDG_ID'].tolist()
lgg_relapse_class = lgg_relapse_df['Relapse'].tolist()
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(relapse_df.iloc[:,1:])
umap_2d_df = pd.DataFrame(proj_2d, lgg_relapse_id).reset_index()
umap_2d_df.columns = ['SDG_ID', 'UMAP1', 'UMAP2']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot of LGG Relapse',
color=lgg_relapse_class, hover_data=['SDG_ID'])
fig_2d.show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/umap/umap_.py:2344: UserWarning: n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1 warn(
# Split the dataset into features and labels
X = relapse_df.loc[:, relapse_df.columns != 'Relapse'].values
y = relapse_df.loc[:, relapse_df.columns == 'Relapse'].values.ravel()
# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(7, 2083) (3, 2083) (7,) (3,)
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
rfc.fit(X_train, y_train)
# Make predictions using random forest classifier
y_pred = rfc.predict(X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
Accuracy: 0.6666666666666666
# Calculate a confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=rfc.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
labels=dict(x="Predicted Relapse", y="True Relapse", color="Productivity"),
x=relapse_df['Relapse'].unique().tolist(),
y=relapse_df['Relapse'].unique().tolist()
)
disp.show()
print(X_test)
print('True Relapse Label:')
print(y_test)
print('Predicted Relapse:')
print(y_pred)
[[201.0 63.0 9160.0 ... 865.0 44.0 335.0] [480.0 43.0 8435.0 ... 1972.0 143.0 596.0] [305.0 81.0 5942.0 ... 3492.0 155.0 282.0]] True Relapse Label: ['Yes' 'No' 'Yes'] Predicted Relapse: ['Yes' 'No' 'No']
# What are the most important features?
feature_list = relapse_df.columns
feature_list = feature_list.drop('Relapse')
imp_features = pd.Series(rfc.feature_importances_, index=feature_list)
imp_genes = imp_features.sort_values(ascending=False).to_frame().reset_index()
imp_genes.columns = ["features", "importance"]
imp_genes_fil = imp_genes[~(imp_genes == 0.000000).any(axis=1)]
imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-612 | 0.020833 |
| 1 | miR-378j | 0.020833 |
| 2 | miR-4685-3p | 0.010417 |
| 3 | miR-5704 | 0.010417 |
| 4 | miR-6752-5p | 0.010417 |
| ... | ... | ... |
| 89 | miR-3605-5p | 0.010417 |
| 90 | miR-100-5p | 0.010417 |
| 91 | let-7b-5p | 0.010417 |
| 92 | miR-323b-5p | 0.010417 |
| 93 | miR-6505-5p | 0.010417 |
94 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(imp_genes_fil, x=imp_genes_fil.features, y=imp_genes_fil.importance)
fig.show()
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(imp_genes_fil['features'].to_list()[0:6])
# UMAP Plot
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_relapse_df)
umap_2d_df = pd.DataFrame(proj_2d, lgg_relapse_id).reset_index()
umap_2d_df.columns = ['SDG_ID', 'UMAP1', 'UMAP2']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot for using 6 miRNA genes found by RF (Relapse)',
color=lgg_relapse_class, hover_data=['SDG_ID'])
fig_2d.show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/umap/umap_.py:2344: UserWarning: n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1
3-9 miRNA genes are enough to separate out LGG relapses.
print('Name of the 9 miRNAs: ', imp_genes_fil['features'].to_list()[0:9])
Name of the 9 miRNAs: ['miR-612', 'miR-378j', 'miR-4685-3p', 'miR-5704', 'miR-6752-5p', 'miR-302d-5p', 'miR-4276', 'miR-302c-5p', 'miR-4500']